---
#loading data
rats <- vroom::vroom(here::here("ca09.mfa2024/data/Rat_Sightings_20231015.csv"),
na = c("", "NA", "N/A")) %>%
janitor::clean_names() %>%
mutate(
# convert the string to a date format
created_date = mdy_hms(created_date),
# just the date without hour info
sighting_date = as.Date(created_date),
# get the year, month, etc
sighting_year = year(created_date),
sighting_month = month(created_date),
sighting_month_name = month(created_date, label = TRUE, abbr = FALSE),
sighting_day = day(created_date),
sighting_weekday = wday(created_date, label = TRUE, abbr = FALSE)) %>%
filter(borough != "Unspecified") %>%
mutate(borough = str_to_title(borough))
#loading the nyc_weather dataset
nyc_weather <- vroom::vroom(here::here("ca09.mfa2024/data/nyc_weather.csv"),
na = c("", "NA", "N/A")) %>%
janitor::clean_names()%>%
mutate(
# convert the string to a date format
date = dmy(datetime),
# get the year, month, etc
year = year(date),
month = month(date),
month_name = month(date, label = TRUE, abbr = FALSE),
day = day(date),
weekday = wday(date, label = TRUE, abbr = FALSE))
#looking at the laoded data
glimpse(rats)
## Rows: 230,028
## Columns: 44
## $ unique_key <dbl> 56242499, 56242808, 57383018, 56610428,…
## $ created_date <dttm> 2022-12-13 18:52:10, 2022-12-13 14:28:…
## $ closed_date <chr> "12/13/2022 06:52:10 PM", "12/13/2022 0…
## $ agency <chr> "DOHMH", "DOHMH", "DOHMH", "DOHMH", "DO…
## $ agency_name <chr> "Department of Health and Mental Hygien…
## $ complaint_type <chr> "Rodent", "Rodent", "Rodent", "Rodent",…
## $ descriptor <chr> "Rat Sighting", "Rat Sighting", "Rat Si…
## $ location_type <chr> "3+ Family Apt. Building", "Other (Expl…
## $ incident_zip <dbl> 10472, 10025, 10026, 11204, 10451, 1122…
## $ incident_address <chr> "1259 BRONX RIVER AVENUE", "741 WEST EN…
## $ street_name <chr> "BRONX RIVER AVENUE", "WEST END AVENUE"…
## $ cross_street_1 <chr> "COLGATE AVENUE", "WEST 96 STREET", "WE…
## $ cross_street_2 <chr> "EAST 172 STREET", "WEST 97 STREET", "W…
## $ intersection_street_1 <chr> "COLGATE AVENUE", "WEST 96 STREET", "…
## $ intersection_street_2 <chr> "EAST 172 STREET", "WEST 97 STREET",…
## $ address_type <chr> "ADDRESS", "ADDRESS", "ADDRESS", "INTER…
## $ city <chr> "BRONX", "NEW YORK", "NEW YORK", "BROOK…
## $ landmark <chr> "BRONX RIVER AVENUE", "WEST END AVENUE"…
## $ facility_type <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ status <chr> "Closed", "Closed", "Closed", "Closed",…
## $ due_date <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ resolution_action_updated_date <chr> "12/13/2022 06:52:10 PM", "12/13/2022 0…
## $ community_board <chr> "09 BRONX", "07 MANHATTAN", "10 MANHATT…
## $ borough <chr> "Bronx", "Manhattan", "Manhattan", "Bro…
## $ x_coordinate_state_plane <dbl> 1016821, 991701, 997727, 989367, 100845…
## $ y_coordinate_state_plane <dbl> 241766, 229075, 232511, 167979, 239668,…
## $ park_facility_name <chr> "Unspecified", "Unspecified", "Unspecif…
## $ park_borough <chr> "BRONX", "MANHATTAN", "MANHATTAN", "BRO…
## $ vehicle_type <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ taxi_company_borough <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ taxi_pick_up_location <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ bridge_highway_name <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ bridge_highway_direction <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ road_ramp <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ bridge_highway_segment <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ latitude <dbl> 40.8, 40.8, 40.8, 40.6, 40.8, 40.7, 40.…
## $ longitude <dbl> -73.9, -74.0, -74.0, -74.0, -73.9, -73.…
## $ location <chr> "(40.83020720581248, -73.88230434536744…
## $ sighting_date <date> 2022-12-13, 2022-12-13, 2023-04-21, 20…
## $ sighting_year <dbl> 2022, 2022, 2023, 2023, 2022, 2023, 202…
## $ sighting_month <dbl> 12, 12, 4, 1, 12, 6, 8, 2, 3, 3, 7, 7, …
## $ sighting_month_name <ord> December, December, April, January, Dec…
## $ sighting_day <int> 13, 13, 21, 24, 13, 30, 14, 24, 22, 22,…
## $ sighting_weekday <ord> Tuesday, Tuesday, Friday, Tuesday, Tues…
skim(rats)
| Name | rats |
| Number of rows | 230028 |
| Number of columns | 44 |
| _______________________ | |
| Column type frequency: | |
| character | 23 |
| Date | 1 |
| factor | 2 |
| logical | 8 |
| numeric | 9 |
| POSIXct | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| closed_date | 17910 | 0.92 | 22 | 22 | 0 | 102336 | 0 |
| agency | 0 | 1.00 | 5 | 5 | 0 | 1 | 0 |
| agency_name | 0 | 1.00 | 39 | 39 | 0 | 1 | 0 |
| complaint_type | 0 | 1.00 | 6 | 6 | 0 | 1 | 0 |
| descriptor | 0 | 1.00 | 12 | 12 | 0 | 1 | 0 |
| location_type | 12 | 1.00 | 5 | 29 | 0 | 49 | 0 |
| incident_address | 10926 | 0.95 | 2 | 52 | 0 | 104381 | 0 |
| street_name | 10927 | 0.95 | 2 | 49 | 0 | 8336 | 0 |
| cross_street_1 | 26355 | 0.89 | 2 | 35 | 0 | 6756 | 0 |
| cross_street_2 | 26348 | 0.89 | 2 | 35 | 0 | 6847 | 0 |
| intersection_street_1 | 118251 | 0.49 | 2 | 32 | 0 | 5782 | 0 |
| intersection_street_2 | 118215 | 0.49 | 3 | 32 | 0 | 5868 | 0 |
| address_type | 4752 | 0.98 | 7 | 12 | 0 | 6 | 0 |
| city | 3647 | 0.98 | 5 | 19 | 0 | 102 | 0 |
| landmark | 138598 | 0.40 | 3 | 32 | 0 | 4074 | 0 |
| status | 0 | 1.00 | 4 | 11 | 0 | 7 | 0 |
| due_date | 97803 | 0.57 | 22 | 22 | 0 | 132168 | 0 |
| resolution_action_updated_date | 9533 | 0.96 | 22 | 22 | 0 | 140301 | 0 |
| community_board | 0 | 1.00 | 8 | 25 | 0 | 73 | 0 |
| borough | 0 | 1.00 | 5 | 13 | 0 | 5 | 0 |
| park_facility_name | 0 | 1.00 | 11 | 11 | 0 | 1 | 0 |
| park_borough | 0 | 1.00 | 5 | 13 | 0 | 5 | 0 |
| location | 2245 | 0.99 | 26 | 40 | 0 | 114670 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| sighting_date | 0 | 1 | 2010-01-01 | 2023-10-14 | 2018-07-03 | 5034 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| sighting_month_name | 0 | 1 | TRUE | 12 | Jul: 25811, Aug: 25551, Jun: 24694, May: 23633 |
| sighting_weekday | 0 | 1 | TRUE | 7 | Mon: 39123, Tue: 38783, Wed: 38196, Thu: 36679 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| facility_type | 230028 | 0 | NaN | : |
| vehicle_type | 230028 | 0 | NaN | : |
| taxi_company_borough | 230028 | 0 | NaN | : |
| taxi_pick_up_location | 230028 | 0 | NaN | : |
| bridge_highway_name | 230028 | 0 | NaN | : |
| bridge_highway_direction | 230028 | 0 | NaN | : |
| road_ramp | 230028 | 0 | NaN | : |
| bridge_highway_segment | 230028 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| unique_key | 0 | 1.00 | 4.00e+07 | 1.23e+07 | 1.15e+07 | 3.03e+07 | 3.96e+07 | 5.16e+07 | 5.91e+07 | ▂▅▆▅▇ |
| incident_zip | 419 | 1.00 | 1.07e+04 | 5.89e+02 | 8.30e+01 | 1.01e+04 | 1.11e+04 | 1.12e+04 | 1.00e+05 | ▇▁▁▁▁ |
| x_coordinate_state_plane | 2245 | 0.99 | 1.00e+06 | 1.84e+04 | 9.13e+05 | 9.93e+05 | 1.00e+06 | 1.01e+06 | 1.07e+06 | ▁▁▇▅▁ |
| y_coordinate_state_plane | 2243 | 0.99 | 2.08e+05 | 2.92e+04 | 1.21e+05 | 1.87e+05 | 2.03e+05 | 2.33e+05 | 2.72e+05 | ▁▃▇▅▃ |
| latitude | 2245 | 0.99 | 4.07e+01 | 8.00e-02 | 4.05e+01 | 4.07e+01 | 4.07e+01 | 4.08e+01 | 4.09e+01 | ▁▃▇▅▃ |
| longitude | 2245 | 0.99 | -7.39e+01 | 7.00e-02 | -7.42e+01 | -7.40e+01 | -7.39e+01 | -7.39e+01 | -7.37e+01 | ▁▁▇▅▁ |
| sighting_year | 0 | 1.00 | 2.02e+03 | 3.90e+00 | 2.01e+03 | 2.02e+03 | 2.02e+03 | 2.02e+03 | 2.02e+03 | ▃▅▃▆▇ |
| sighting_month | 0 | 1.00 | 6.56e+00 | 3.04e+00 | 1.00e+00 | 4.00e+00 | 7.00e+00 | 9.00e+00 | 1.20e+01 | ▇▇▇▇▇ |
| sighting_day | 0 | 1.00 | 1.57e+01 | 8.73e+00 | 1.00e+00 | 8.00e+00 | 1.60e+01 | 2.30e+01 | 3.10e+01 | ▇▇▇▇▆ |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| created_date | 0 | 1 | 2010-01-01 08:29:58 | 2023-10-14 01:55:50 | 2018-07-03 | 113481 |
#Plotting the data
# See the count of rat sightings by weekday
rats %>%
count(sighting_weekday) %>%
kable()
| sighting_weekday | n |
|---|---|
| Sunday | 22529 |
| Monday | 39123 |
| Tuesday | 38783 |
| Wednesday | 38196 |
| Thursday | 36679 |
| Friday | 32558 |
| Saturday | 22160 |
# Assign a summarixed data frame to an object to use it in a plot
rats_by_weekday <- rats %>%
count(sighting_weekday, sighting_year)
ggplot(rats_by_weekday, aes(x = fct_rev(sighting_weekday), y = n)) +
geom_col() +
coord_flip() +
facet_wrap(~ sighting_year)
# See the count of rat sightings by weekday and borough
rats %>%
count(sighting_weekday, borough, sighting_year) %>%
head(10) %>%
kable()
| sighting_weekday | borough | sighting_year | n |
|---|---|---|---|
| Sunday | Bronx | 2010 | 159 |
| Sunday | Bronx | 2011 | 145 |
| Sunday | Bronx | 2012 | 181 |
| Sunday | Bronx | 2013 | 166 |
| Sunday | Bronx | 2014 | 224 |
| Sunday | Bronx | 2015 | 270 |
| Sunday | Bronx | 2016 | 314 |
| Sunday | Bronx | 2017 | 357 |
| Sunday | Bronx | 2018 | 274 |
| Sunday | Bronx | 2019 | 242 |
# An alternative to count() is to specify the groups with group_by() and then
# be explicit about how you're summarising the groups, such as calculating the
# mean, standard deviation, or number of observations (we do that here with
# `n()`).
# rats %>%
# group_by(sighting_weekday, borough) %>%
# summarise(n = n())
#Scatter Plot for sightings per day
rats %>%
group_by(sighting_date)%>%
summarise(sighting_per_day = n())%>%
ggplot(aes(x = sighting_date, y = sighting_per_day ))+
geom_point(alpha = 0.5)+
labs(title = "Number of Rat sightings in NYC per Day",
x = "Days",
y = "Number of Rat Sightings")
Upon closer examination, it appears that rat sightings exhibit a seasonal trend. Specifically, we observe an increase in sightings during the warmer months, typically associated with summer, while during the colder, winter months, when temperatures drop, there is a noticeable decrease. This aligns with our initial hypothesis that rat sightings tend to diminish during winter, as the cold weather often prompts rodents to enter a hibernation-like state, reducing their visibility, and then resurge during the warmer months.
#Bar chart showing Number of Rat sightings in NYC by Month
rats %>%
group_by(sighting_month_name) %>%
summarise(monthly_count = n()) %>%
ggplot(aes(x= sighting_month_name, y = monthly_count)) +
geom_col()+
labs(title = paste("Number of Rat sightings in NYC by Month"),
x = "Month",
y = "Number of Rat Sightings")
#Bar chart showing Number of Rat sightings in NYC by Year
rats %>%
filter(sighting_year < 2023) %>% #filterng 2023 because of incomplete data points
group_by(sighting_year) %>%
summarise(yearly_count = n()) %>%
ggplot(aes(x= sighting_year, y = yearly_count))+
geom_col()+
labs(title = "Number of Rat sightings in NYC by Year",
x = "Year",
y = "Number of Rat Sightings")
We hypothesise that these sightings can also be linked to the number of people in the area, for example 2020 was the year of COVID-19 and with lockdowns in place, the fall in sightings could be because of the number of people outside.
#Function for line chart showing Number of Rat sightings in NYC by chosen Year
sighting_by_month_in_year <- function(sighting_year) {
rats %>%
filter(sighting_year == {{sighting_year}}) %>%
group_by(sighting_month_name) %>%
summarise(monthly_count = n()) %>%
ggplot(aes(x= sighting_month_name, y = monthly_count, group = 1)) +
geom_col()+
labs(title = paste("Number of Rat sightings in NYC by Month in Year", sighting_year),
x = "Month",
y = "Number of Rat Sightings")
}
sighting_by_month_in_year(2022)
# Change in sightings across 5 boroughs from 2015-2022
rats %>%
filter(sighting_year >= 2015 & sighting_year <= 2022)%>%
group_by(borough, sighting_year) %>%
summarise(borough_sightings = n()) %>%
ggplot( aes( x= sighting_year, y = borough_sightings)) +
geom_line()+
facet_wrap(~borough)
#mapping the data
# let's get the top 7 location types, which account for > 90% of all cases
# this code generates a vector with the top 7 location types
top_location_types <- rats %>%
count(location_type, sort=TRUE) %>%
mutate(perc = 100*n/sum(n)) %>%
slice(1:7) %>%
select(location_type) %>%
pull()
# lets us choose how to colour each point. What palette and colours to use?
# A great site to get the relevant color hex codes for maps is
# https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=7
my_colours <- c('#e41a1c','#377eb8','#4daf4a','#984ea3','#ff7f00','#ffff33','#a65628')
# create a function point_fill, that assigns `my_colours` to different location types
# you can read more here https://rstudio.github.io/leaflet/colors.html
point_fill <- colorFactor(palette = my_colours,
rats$location_type)
rats %>%
filter(sighting_year == 2022) %>% # just show 2022 data
filter(location_type %in% top_location_types) %>%
leaflet() %>%
addProviderTiles("OpenStreetMap.Mapnik") %>%
addCircleMarkers(lng = ~longitude,
lat = ~latitude,
radius = 1,
color = ~point_fill(location_type),
fillOpacity = 0.6,
popup = ~created_date,
label = ~location_type) %>%
addLegend("bottomright", pal = point_fill,
values = ~location_type,
title = "2022 Location Type",
opacity = 0.5)
#rat sightings vs a borough’s human population
# https://en.wikipedia.org/wiki/Boroughs_of_New_York_City
# NYC Boroughs, 2020 census data
# Bronx 1,472,654
# Brooklyn 2,736,074
# Manhattan 1,694,263
# Queens 2,405,464
# Staten Island 495,747
nyc_population <- tibble(
borough = c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island"),
population = c(1472654,2736074,1694263,2405464,495747))
library(ggtext)
# your ggplot code +
labs(
title = "Mouse sightings don't always track a <span style='color:#FFA500'><b>borough's population</span></b>"
) +
theme(
plot.title.position = "plot",
plot.title = element_textbox_simple(size=16)) +
NULL
## integer(0)
# Calculate population proportion for each borough
nyc_population <- nyc_population %>%
mutate(population_proportion = population / sum(population))
# Subset the data for the years 2014-2022
filtered_rats <- rats %>%
filter(sighting_year >= 2014 & sighting_year <= 2022)
# Calculate total sightings each year
total_sightings_by_year <- filtered_rats %>%
group_by(sighting_year)%>%
summarise(total_sightings_by_year = n())
# Combine three datasets and calculate proportions and CI for each borough and year
rat_sightings_by_borough_by_year <- filtered_rats %>%
group_by(borough, sighting_year) %>%
summarise(total_sightings_by_borough_by_year = n()) %>%
left_join(nyc_population, by = "borough") %>%
left_join(total_sightings_by_year, by = "sighting_year") %>%
mutate(
proportion = total_sightings_by_borough_by_year / total_sightings_by_year,
# Calculate the Margin of Error
margin_of_error = 1.96 * sqrt(proportion * (1 - proportion) / total_sightings_by_borough_by_year),
lower_CI = proportion - margin_of_error,
upper_CI = proportion + margin_of_error,
)%>%
group_by(sighting_year) %>%
arrange(population_proportion)
# Plot the graph
ggplot(rat_sightings_by_borough_by_year, aes(y = reorder(borough, -desc(population_proportion)), x = proportion, xmin = lower_CI, xmax = upper_CI)) +
geom_errorbar(width = 0.2) +
geom_point(aes(x = population_proportion), size = 5, color = "#FFA500") +
facet_wrap(~sighting_year, scales = "free_y")+
labs(
title = "Mouse sightings don't always track a <span style='color:#FFA500'><b>borough's population</span></b>",
x = NULL,
y = NULL
) +
scale_x_continuous(labels = scales::percent)+
theme(panel.grid.major = element_line(colour = "grey"),
panel.grid.minor = element_line(colour = "grey"),
panel.background = element_rect(fill = "white"),
plot.background = element_rect(fill = "white"))+
theme(
plot.title.position = "plot",
plot.title = element_textbox_simple(size=16)) +
NULL
library(tidyquant)
sp500 <- "SPY" %>%
tq_get(get = "stock.prices",
from = "2010-01-01",
to = Sys.Date()) # today's date
sp500_returns_daily <- sp500 %>%
tq_transmute(select = adjusted,
mutate_fun = periodReturn,
period = "daily",
type = "log",
col_rename = "daily_return")
rats %>%
group_by(borough, sighting_date) %>%
summarise(count = n()) %>%
left_join(sp500_returns_daily,
by = c("sighting_date" = "date")) %>%
mutate(sp500_up_down = ifelse(daily_return > 0, "up", "down")) %>%
group_by(borough, sp500_up_down) %>%
summarise(
mean_n = mean(count),
sd_n = sd(count),
count = n(),
se_n = sd_n / sqrt(count),
lower_95 = mean_n - (1.96 * se_n),
upper_95 = mean_n + (1.96 * se_n))
## # A tibble: 15 × 8
## # Groups: borough [5]
## borough sp500_up_down mean_n sd_n count se_n lower_95 upper_95
## <chr> <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 Bronx down 9.83 4.83 1555 0.123 9.59 10.1
## 2 Bronx up 9.53 4.86 1902 0.111 9.31 9.75
## 3 Bronx <NA> 5.59 3.43 1528 0.0878 5.42 5.77
## 4 Brooklyn down 19.3 10.8 1565 0.274 18.8 19.9
## 5 Brooklyn up 18.9 10.9 1901 0.251 18.4 19.4
## 6 Brooklyn <NA> 11.2 7.17 1554 0.182 10.8 11.5
## 7 Manhattan down 14.0 7.41 1564 0.187 13.6 14.3
## 8 Manhattan up 13.3 7.05 1897 0.162 13.0 13.7
## 9 Manhattan <NA> 8.80 5.54 1545 0.141 8.53 9.08
## 10 Queens down 8.11 4.93 1540 0.126 7.86 8.36
## 11 Queens up 7.95 4.99 1880 0.115 7.73 8.18
## 12 Queens <NA> 4.86 3.38 1463 0.0883 4.69 5.03
## 13 Staten Island down 2.69 1.86 1242 0.0529 2.59 2.79
## 14 Staten Island up 2.69 1.83 1490 0.0474 2.60 2.78
## 15 Staten Island <NA> 2.00 1.43 935 0.0469 1.90 2.09
#4. Regression
#Investigate distribution of rat sightings, independent of borough
rats_by_day_noboroughs <- rats %>%
group_by(sighting_date) %>%
summarise(nr_of_sightings = n())
#Distribution of sightings
ggplot(rats_by_day_noboroughs, aes(x=nr_of_sightings)) +
geom_histogram(binwidth = 10)
#Distribution of log of rat sightings
ggplot(rats_by_day_noboroughs, aes(x=log(nr_of_sightings))) +
geom_histogram(binwidth = 0.1)
#Summarise the rat sightings by day
rats_by_day <- rats %>%
group_by(borough, sighting_date) %>%
summarise(nr_of_sightings = n())
#Joining the rat sightings data with the weather data
rats_weather <- left_join(rats_by_day, nyc_weather, by = c("sighting_date" = "date"))
# import weather dataset
weather <- read_csv(here::here("ca09.mfa2024/data", "nyc_weather.csv"), na = "NA", show_col_types = FALSE)
# modify date variable using lubridate
weather$datetime <- lubridate::dmy(weather$datetime)
# creating the dataset for the single-variable regression
# join rats table with weather data
nyc <-
left_join(rats, weather, c("sighting_date"="datetime"))
# count sightings per day
nyc_sighting <- nyc %>%
group_by(sighting_date) %>%
summarise(sighting_count = n())
# select only unique values in the weather dataset to remove
nyc_temp <- nyc %>%
select(sighting_date, temp) %>%
distinct()
# join temprature and sighting per day
nyc_simple_reg <-
left_join(nyc_sighting, nyc_temp, by="sighting_date")
#group_by(sighting_date)
glimpse(nyc_simple_reg)
## Rows: 5,034
## Columns: 3
## $ sighting_date <date> 2010-01-01, 2010-01-02, 2010-01-03, 2010-01-04, 2010-0…
## $ sighting_count <int> 9, 12, 3, 24, 14, 21, 15, 13, 10, 7, 25, 16, 24, 15, 19…
## $ temp <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
nyc_w = nyc_weather %>%
mutate(sighting_date = datetime)
rats_sighted = rats %>%
group_by(sighting_date, borough, sighting_weekday, sighting_month_name,
sighting_day,
sighting_year) %>%
summarize(mouse_sighted = n(),)
## `summarise()` has grouped output by 'sighting_date', 'borough',
## 'sighting_weekday', 'sighting_month_name', 'sighting_day'. You can override
## using the `.groups` argument.
lm_data = left_join(rats_sighted, nyc_w, by="sighting_date")
lm_data_na_rm = na.omit(data.frame(lm_data) %>%
dplyr::select(sighting_date, mouse_sighted, borough, sighting_year, sighting_day,
sighting_weekday, sighting_month_name, temp, humidity,
visibility, precip))
lm_data_na_rm_log = lm_data_na_rm %>%
mutate (log_times = log(mouse_sighted)) %>%
mutate (sighting_weekday = as.character(sighting_weekday)) %>%
mutate (sighting_month_name = as.character(sighting_month_name))
head(lm_data_na_rm_log)
## sighting_date mouse_sighted borough sighting_year sighting_day
## 1 2010-01-01 2 Bronx 2010 1
## 2 2010-01-01 4 Brooklyn 2010 1
## 3 2010-01-01 1 Manhattan 2010 1
## 4 2010-01-01 2 Queens 2010 1
## 5 2010-01-02 3 Bronx 2010 2
## 6 2010-01-02 4 Brooklyn 2010 2
## sighting_weekday sighting_month_name temp humidity visibility precip
## 1 Friday January 2.4 82.5 10.6 2.40
## 2 Friday January 2.4 82.5 10.6 2.40
## 3 Friday January 2.4 82.5 10.6 2.40
## 4 Friday January 2.4 82.5 10.6 2.40
## 5 Saturday January -3.5 60.2 14.3 0.18
## 6 Saturday January -3.5 60.2 14.3 0.18
## log_times
## 1 0.693
## 2 1.386
## 3 0.000
## 4 0.693
## 5 1.099
## 6 1.386
lm_data_na_rm_log %>%
ggplot(aes(x = mouse_sighted, fill = "red", col = "blue"))+
geom_histogram(binwidth = 1)+
theme_bw()+
labs (x = "Number of Mouse Sighted",
y = "Count",
title = "Disrtibution of Mouse Sighted (in each location on each day)",
subtitle = paste("Skewness = ",
round(skewness(lm_data_na_rm_log$mouse_sighted),4),
sep =""))+
theme(legend.position = "none")
lm_data_na_rm_log %>%
ggplot(aes(x = log_times, fill="red"))+
geom_histogram(binwidth=0.1)+
theme_bw()+
labs (x = "Number of Logged Mouse Sighted",
y = "Count",
title = "Disrtibution of Logged Mouse Sighted (in each location on each day)",
subtitle = paste("Skewness = ",
round(skewness(lm_data_na_rm_log$log_times),4),
sep =""))+
theme(legend.position = "none")
model1 = lm(data = lm_data_na_rm_log, log_times~temp)
msummary(model1)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.5197 0.0117 129.4 <2e-16 ***
## temp 0.0246 0.0007 35.2 <2e-16 ***
##
## Residual standard error: 0.876 on 17376 degrees of freedom
## Multiple R-squared: 0.0664, Adjusted R-squared: 0.0664
## F-statistic: 1.24e+03 on 1 and 17376 DF, p-value: <2e-16
model2 = lm(data = lm_data_na_rm_log, log_times ~ temp + borough)
msummary(model2)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.546810 0.012695 121.8 <2e-16 ***
## temp 0.028700 0.000516 55.6 <2e-16 ***
## boroughBrooklyn 0.547974 0.014970 36.6 <2e-16 ***
## boroughManhattan 0.302432 0.014982 20.2 <2e-16 ***
## boroughQueens -0.325024 0.015103 -21.5 <2e-16 ***
## boroughStaten Island -1.287073 0.016384 -78.5 <2e-16 ***
##
## Residual standard error: 0.644 on 17372 degrees of freedom
## Multiple R-squared: 0.495, Adjusted R-squared: 0.495
## F-statistic: 3.41e+03 on 5 and 17372 DF, p-value: <2e-16
model_n = lm(data = lm_data_na_rm_log, mouse_sighted~temp + borough)
msummary(model_n)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.13049 0.12287 41.8 <2e-16 ***
## temp 0.24074 0.00499 48.2 <2e-16 ***
## boroughBrooklyn 6.84463 0.14489 47.2 <2e-16 ***
## boroughManhattan 3.15229 0.14500 21.7 <2e-16 ***
## boroughQueens -1.98048 0.14617 -13.6 <2e-16 ***
## boroughStaten Island -6.36670 0.15857 -40.1 <2e-16 ***
##
## Residual standard error: 6.23 on 17372 degrees of freedom
## Multiple R-squared: 0.37, Adjusted R-squared: 0.369
## F-statistic: 2.04e+03 on 5 and 17372 DF, p-value: <2e-16
performance::check_model(model1)
performance::check_model(model2)
car::vif(model2)
## GVIF Df GVIF^(1/(2*Df))
## temp 1 1 1
## borough 1 4 1
library(huxtable)
summary_table = huxtable::huxreg(model1, model2,
statistics = c('No. observations' = 'nobs',
'R^2' = 'r.squared',
"Adj. R^2" = "adj.r.squared"),
bold_signif = 0.05) %>%
set_caption("Two-Model Result")
summary_table
| (1) | (2) | |
|---|---|---|
| (Intercept) | 1.520 *** | 1.547 *** |
| (0.012) | (0.013) | |
| temp | 0.025 *** | 0.029 *** |
| (0.001) | (0.001) | |
| boroughBrooklyn | 0.548 *** | |
| (0.015) | ||
| boroughManhattan | 0.302 *** | |
| (0.015) | ||
| boroughQueens | -0.325 *** | |
| (0.015) | ||
| boroughStaten Island | -1.287 *** | |
| (0.016) | ||
| No. observations | 17378 | 17378 |
| R^2 | 0.066 | 0.495 |
| Adj. R^2 | 0.066 | 0.495 |
| *** p < 0.001; ** p < 0.01; * p < 0.05. | ||